In [160]:
import pandas
import networkx
from matplotlib import pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')
# Load the package table.  `DataFrame.from_csv` no longer exists in pandas;
# `read_csv` with `index_col=0, parse_dates=True` reproduces its defaults.
data = pandas.read_csv('../data/github-cran-150601.csv', index_col=0, parse_dates=True)
data['Date'] = pandas.to_datetime(data['Date'])
# `DataFrame.sort` was removed as well.  The later "last row per package"
# selection relies on this ordering, so use a stable sort: rows sharing a
# date keep their file order and the selection stays deterministic.
data = data.sort_values('Date', kind='mergesort')
In [161]:
# Node names that graph_for_data() strips from the dependency graph when
# `ignore_R` is set -- presumably R itself plus the packages bundled with
# it (TODO confirm against the R distribution in use).
R_packages = [
    'R', 'MASS', 'Matrix', 'base', 'boot', 'class', 'cluster', 'codetools',
    'compiler', 'datasets', 'foreign', 'grDevices', 'graphics', 'grid',
    'lattice', 'methods', 'mgcv', 'nlme', 'nnet', 'parallel', 'rpart',
    'spatial', 'splines', 'stats', 'stats4', 'survival', 'tcltk', 'tools',
    'translations', 'utils',
]
In [162]:
def data_for_date(data, date):
    """Return the CRAN snapshot as of ``date``, one row per package.

    Rows are kept when ``InCRAN == 1`` and ``Date <= date``.  For each
    package only the *last* such row, in the current row order of ``data``,
    is retained, so ``data`` is expected to be sorted by ``Date`` already
    (the function does not sort).

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Package``, ``InCRAN`` and ``Date``.
    date : str, datetime-like or pandas.Timestamp
        Inclusive cut-off; anything ``pandas.Timestamp`` accepts.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Package``, with missing values replaced by ``''``.
    """
    cutoff = pandas.Timestamp(date)
    selected = data[(data['InCRAN'] == 1) & (data['Date'] <= cutoff)]
    # `keep='last'` is the supported spelling of the removed `take_last=True`.
    latest = selected.drop_duplicates('Package', keep='last')
    return latest.fillna('').set_index('Package')
In [163]:
def graph_for_data(data, ignore_R=True):
    """Build the package dependency digraph for one snapshot.

    Every index label of ``data`` becomes a node.  An edge
    ``package -> dependency`` is added for each space-separated token of
    the row's ``Dependencies`` string that is itself an index label;
    tokens naming unknown packages are ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Indexed by package name, with a string column ``Dependencies``.
    ignore_R : bool
        When true, the nodes listed in ``R_packages`` (and their edges)
        are removed before the graph is returned.

    Returns
    -------
    networkx.DiGraph
    """
    known = data.index
    G = networkx.DiGraph()
    G.add_nodes_from(list(known))
    for package, dependencies in zip(known, data['Dependencies']):
        G.add_edges_from(
            (package, dep) for dep in dependencies.split(' ') if dep in known
        )
    if ignore_R:
        G.remove_nodes_from(R_packages)
    return G
In [164]:
def nodes_properties(G):
    """Compute per-node structural metrics of a dependency DAG.

    For every node the result holds:

    * ``ancestors`` / ``descendants`` -- sizes of the transitive sets,
    * ``out`` / ``in`` -- out- and in-degree,
    * ``isolated`` -- whether the node has no edges at all,
    * ``min_strata`` / ``max_strata`` -- 1 for a node without successors,
      otherwise 1 + the smallest ``min_strata`` (resp. largest
      ``max_strata``) among its successors.

    The two strata values are also stored on ``G`` as node attributes
    ``'min_strata'`` and ``'max_strata'``.

    Parameters
    ----------
    G : networkx.DiGraph
        Must be acyclic: the strata are computed along a topological
        order, which networkx cannot produce for a graph with cycles.

    Returns
    -------
    pandas.DataFrame
        One row per node, columns as listed above (empty, but with those
        columns, for an empty graph).
    """
    # Materialise the isolates: newer networkx returns a one-shot generator,
    # on which repeated `n in isolated` tests would give wrong answers.
    isolated = set(networkx.isolates(G))

    # Walk the topological order backwards so that every successor of a
    # node has been processed before the node itself.  (The `reverse=`
    # keyword of topological_sort is not available in current networkx.)
    min_strata = {}
    max_strata = {}
    for node in reversed(list(networkx.topological_sort(G))):
        below = list(G.successors(node))
        # `or [0]` covers nodes without successors, giving them stratum 1.
        min_strata[node] = 1 + min([min_strata[s] for s in below] or [0])
        max_strata[node] = 1 + max([max_strata[s] for s in below] or [0])

    # Keyword arguments: the positional order of `name` and `values`
    # differs between networkx 1.x and later releases.
    networkx.set_node_attributes(G, values=min_strata, name='min_strata')
    networkx.set_node_attributes(G, values=max_strata, name='max_strata')

    columns = ['ancestors', 'descendants', 'out', 'in', 'isolated',
               'min_strata', 'max_strata']
    records = {
        n: {
            'ancestors': len(networkx.ancestors(G, n)),
            'descendants': len(networkx.descendants(G, n)),
            'out': G.out_degree(n),
            'in': G.in_degree(n),
            'isolated': n in isolated,
            'min_strata': min_strata[n],
            'max_strata': max_strata[n],
        }
        for n in G
    }
    packages = pandas.DataFrame.from_dict(records, orient='index')
    # Fix the column order and keep the columns present for an empty graph.
    return packages.reindex(columns=columns)
In [165]:
# Snapshot dates: every third month-end from 2010-01-31 up to the last one
# not after 2015-01-01 (i.e. 2014-10-31).  An explicit MonthEnd offset is
# used instead of the '3M' alias, which newer pandas releases deprecate.
dates = pandas.date_range(start='2010-01', end='2015-01', freq=pandas.offsets.MonthEnd(3))
In [166]:
from collections import OrderedDict
# Per-snapshot results, keyed by snapshot date in the order of `dates`:
#   summary[date] -> OrderedDict of scalar metrics
#   graphs[date]  -> dependency graph
#   props[date]   -> per-package property table
summary = OrderedDict()
graphs = OrderedDict()
props = OrderedDict()

for date in dates:
    G = graph_for_data(data_for_date(data, date))
    p = nodes_properties(G)
    # All metrics except the node count ignore packages without any edge.
    nonisolated = p.query('isolated == False')

    d = OrderedDict([
        ('nodes', len(G)),
        ('nonisolated', len(nonisolated)),
        ('max ancestors', nonisolated['ancestors'].max()),
        ('max descendants', nonisolated['descendants'].max()),
    ])
    # (Mean min/max strata used to be recorded here; currently disabled.)
    # Each condition doubles as the metric name and as the row filter.
    for condition in ('min_strata <= 1', 'max_strata <= 1',
                      'min_strata >= 3', 'max_strata >= 3'):
        d[condition] = len(nonisolated.query(condition))

    summary[date] = d
    graphs[date] = G
    props[date] = p
In [167]:
# One row per snapshot date, one column per summary metric.
df = pandas.DataFrame.from_dict(summary, orient='index')
df.index = pandas.to_datetime(df.index)
# The first four columns use the default line style, the next four are dashed.
ax = df.plot(style=[None] * 4 + ['--'] * 4, title='CRAN Packages')
ax.figure.set_size_inches(16, 8)
In [168]:
# One box plot per metric: the distribution over non-isolated packages,
# with one box per snapshot date.  `items()` works on Python 2 and 3 alike.
# The 'ancestors' plot is deliberately last: `df` keeps the frame of the
# final iteration, which the following cell sorts and displays.
for column, title in [
    ('min_strata', 'Minimal Strata Distribution for CRAN Packages'),
    ('max_strata', 'Maximal Strata Distribution for CRAN Packages'),
    ('descendants', 'Descendants Distribution for CRAN Packages'),
    ('ancestors', 'Ancestors Distribution for CRAN Packages'),
]:
    df = pandas.DataFrame.from_dict(
        {date: p.query('isolated == False')[column] for date, p in props.items()}
    )
    ax = df.plot(title=title, kind='box')
    ax.figure.set_size_inches(16, 4)
In [169]:
# Rank the rows of `df` (the last frame built in the preceding cell: one
# column per snapshot date) by their value in the 2014-07-31 snapshot.
# `DataFrame.sort` was removed from pandas; `sort_values` replaces it, and
# the column is addressed by an explicit Timestamp instead of a string.
df.sort_values(pandas.Timestamp('2014-07-31'), ascending=False)
Out[169]:
In [180]:
# Baseline: the property table of the first snapshot stored in `props`.
# (`props.keys()[0]` only works on Python 2; this works on both.)
old_packages = next(iter(props.values()))

# Collect, for every snapshot, the rows of packages that are absent from
# the baseline, each tagged with the date of the snapshot it came from.
# NOTE(review): a package missing from the baseline is recorded once for
# *every* snapshot it appears in, not only the one where it first shows
# up -- confirm this is the intended meaning of "new package".
new_data_index = []
new_data = []
for date, ddata in props.items():
    # Drop the baseline packages up front instead of testing row by row.
    added = ddata[~ddata.index.isin(old_packages.index)]
    for package, metadata in added.iterrows():
        new_data_index.append(package)
        metadata['snapshot_date'] = date
        new_data.append(metadata)
In [181]:
# Stack the collected rows into one table indexed by package name (labels
# repeat when a package was collected for several snapshots) and make sure
# the snapshot date column holds datetimes.
df = pandas.DataFrame(new_data, index=new_data_index).assign(
    snapshot_date=lambda frame: pandas.to_datetime(frame['snapshot_date'])
)
In [182]:
# Rows flagged as being on GitHub, reduced to the first row of each package.
github = data[data['InGitHub'] == 1].drop_duplicates(subset='Package')
In [185]:
# Attach the GitHub flag to every collected row.  The right join keeps all
# rows of `df`; packages without a GitHub entry get InGitHub = 0.
ndf = (
    github.set_index('Package')[['InGitHub']]
    .join(df, how='right')
    .fillna({'InGitHub': 0})
)
In [215]:
# Group the strata of the collected packages by snapshot date, separately
# for packages flagged as on GitHub and for the remaining ones.
strata_by_date = ['min_strata', 'max_strata', 'snapshot_date']
groups_GH = ndf.query('InGitHub == 1')[strata_by_date].groupby(by=['snapshot_date'])
groups_CRAN = ndf.query('InGitHub == 0')[strata_by_date].groupby(by=['snapshot_date'])
In [223]:
# Mean min/max strata per snapshot date for both groups, drawn on shared
# axes: GitHub packages as solid lines, the others as dashed lines.
mean_GH = groups_GH.mean().rename(
    columns={'min_strata': 'min_strata_GH', 'max_strata': 'max_strata_GH'})
mean_CRAN = groups_CRAN.mean().rename(
    columns={'min_strata': 'min_strata_CRAN', 'max_strata': 'max_strata_CRAN'})

ax = mean_GH.plot(title='New Packages Strata Average', style=['g', 'r'])
mean_CRAN.plot(ax=ax, style=['b--', 'y--'])
ax.set_xlabel('Date')
ax.figure.set_size_inches(16, 6)
In [ ]: